Load the necessary library packages prior to loading the data.

Packages used include :

-tidyr

-dplyr

-plotly

-plotly

-forcats


1. Read the beers data file Beers.csv and inspect the dataframe (head and tail).
library(readr)
Beers <- read_csv("Beers.csv")
## Parsed with column specification:
## cols(
##   Name = col_character(),
##   Beer_ID = col_double(),
##   ABV = col_double(),
##   IBU = col_double(),
##   Brewery_id = col_double(),
##   Style = col_character(),
##   Ounces = col_double()
## )
head(Beers)
## # A tibble: 6 x 7
##   Name           Beer_ID   ABV   IBU Brewery_id Style                Ounces
##   <chr>            <dbl> <dbl> <dbl>      <dbl> <chr>                 <dbl>
## 1 Pub Beer          1436 0.05     NA        409 American Pale Lager      12
## 2 Devil's Cup       2265 0.066    NA        178 American Pale Ale (…     12
## 3 Rise of the P…    2264 0.071    NA        178 American IPA             12
## 4 Sinister          2263 0.09     NA        178 American Double / I…     12
## 5 Sex and Candy     2262 0.075    NA        178 American IPA             12
## 6 Black Exodus      2261 0.077    NA        178 Oatmeal Stout            12
tail(Beers)
## # A tibble: 6 x 7
##   Name               Beer_ID   ABV   IBU Brewery_id Style            Ounces
##   <chr>                <dbl> <dbl> <dbl>      <dbl> <chr>             <dbl>
## 1 Rocky Mountain Oy…    1035 0.075    NA        425 American Stout       12
## 2 Belgorado              928 0.067    45        425 Belgian IPA          12
## 3 Rail Yard Ale          807 0.052    NA        425 American Amber …     12
## 4 B3K Black Lager        620 0.055    NA        425 Schwarzbier          12
## 5 Silverback Pale A…     145 0.055    40        425 American Pale A…     12
## 6 Rail Yard Ale (20…      84 0.052    NA        425 American Amber …     12
2. Read the breweries data file Breweries.csv and inspect the dataframe (head and tail).
library(readr)
Breweries <- read_csv("Breweries.csv")
## Parsed with column specification:
## cols(
##   Brew_ID = col_double(),
##   Name = col_character(),
##   City = col_character(),
##   State = col_character()
## )
head(Breweries)
## # A tibble: 6 x 4
##   Brew_ID Name                      City          State
##     <dbl> <chr>                     <chr>         <chr>
## 1       1 NorthGate Brewing         Minneapolis   MN   
## 2       2 Against the Grain Brewery Louisville    KY   
## 3       3 Jack's Abby Craft Lagers  Framingham    MA   
## 4       4 Mike Hess Brewing Company San Diego     CA   
## 5       5 Fort Point Beer Company   San Francisco CA   
## 6       6 COAST Brewing Company     Charleston    SC
tail(Breweries)
## # A tibble: 6 x 4
##   Brew_ID Name                          City          State
##     <dbl> <chr>                         <chr>         <chr>
## 1     553 Mickey Finn's Brewery         Libertyville  IL   
## 2     554 Covington Brewhouse           Covington     LA   
## 3     555 Dave's Brewfarm               Wilson        WI   
## 4     556 Ukiah Brewing Company         Ukiah         CA   
## 5     557 Butternuts Beer and Ale       Garrattsville NY   
## 6     558 Sleeping Lady Brewing Company Anchorage     AK
3. Inspect the data structures, rename columns that make sense and prepare to join the data sets. Joining could be accomplished with a left, right or full join. Full join is used in this case to preserve data from both tables.
str(Breweries)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 558 obs. of  4 variables:
##  $ Brew_ID: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Name   : chr  "NorthGate Brewing" "Against the Grain Brewery" "Jack's Abby Craft Lagers" "Mike Hess Brewing Company" ...
##  $ City   : chr  "Minneapolis" "Louisville" "Framingham" "San Diego" ...
##  $ State  : chr  "MN" "KY" "MA" "CA" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Brew_ID = col_double(),
##   ..   Name = col_character(),
##   ..   City = col_character(),
##   ..   State = col_character()
##   .. )
str(Beers)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 2410 obs. of  7 variables:
##  $ Name      : chr  "Pub Beer" "Devil's Cup" "Rise of the Phoenix" "Sinister" ...
##  $ Beer_ID   : num  1436 2265 2264 2263 2262 ...
##  $ ABV       : num  0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
##  $ IBU       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Brewery_id: num  409 178 178 178 178 178 178 178 178 178 ...
##  $ Style     : chr  "American Pale Lager" "American Pale Ale (APA)" "American IPA" "American Double / Imperial IPA" ...
##  $ Ounces    : num  12 12 12 12 12 12 12 12 12 12 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Name = col_character(),
##   ..   Beer_ID = col_double(),
##   ..   ABV = col_double(),
##   ..   IBU = col_double(),
##   ..   Brewery_id = col_double(),
##   ..   Style = col_character(),
##   ..   Ounces = col_double()
##   .. )
names(Breweries)
## [1] "Brew_ID" "Name"    "City"    "State"
Renamed Brewery_id to match the Brew_ID inthe Breweries dataset.
names(Beers)
## [1] "Name"       "Beer_ID"    "ABV"        "IBU"        "Brewery_id"
## [6] "Style"      "Ounces"
Beers <- rename(Beers, Brew_ID = Brewery_id)
names(Beers)
## [1] "Name"    "Beer_ID" "ABV"     "IBU"     "Brew_ID" "Style"   "Ounces"
Full join method.
fullData <- full_join(Breweries, Beers, by = "Brew_ID")
head(fullData)
## # A tibble: 6 x 10
##   Brew_ID Name.x   City   State Name.y  Beer_ID   ABV   IBU Style    Ounces
##     <dbl> <chr>    <chr>  <chr> <chr>     <dbl> <dbl> <dbl> <chr>     <dbl>
## 1       1 NorthGa… Minne… MN    Get To…    2692 0.045    50 America…     16
## 2       1 NorthGa… Minne… MN    Maggie…    2691 0.049    26 Milk / …     16
## 3       1 NorthGa… Minne… MN    Wall's…    2690 0.048    19 English…     16
## 4       1 NorthGa… Minne… MN    Pumpion    2689 0.06     38 Pumpkin…     16
## 5       1 NorthGa… Minne… MN    Strong…    2688 0.06     25 America…     16
## 6       1 NorthGa… Minne… MN    Parape…    2687 0.056    47 Extra S…     16
Another column renaming is needed to correct the data names after the join.
fullData <- rename(fullData, BreweryName = Name.x, BeerName = Name.y)
names(fullData)
##  [1] "Brew_ID"     "BreweryName" "City"        "State"       "BeerName"   
##  [6] "Beer_ID"     "ABV"         "IBU"         "Style"       "Ounces"
fullData %>%
  group_by(State) %>%
  summarise(count = n())%>%
  arrange(desc(count))
## # A tibble: 51 x 2
##    State count
##    <chr> <int>
##  1 CO      265
##  2 CA      183
##  3 MI      162
##  4 IN      139
##  5 TX      130
##  6 OR      125
##  7 PA      100
##  8 IL       91
##  9 WI       87
## 10 MA       82
## # … with 41 more rows
Exploring the number of breweries in each state, the Breweries.csv data is used. Grouped the Breweries data table by states.
BrewStatesC <- Breweries%>%
  group_by(State)%>%
  summarise(count = n())%>%
  arrange(desc(count))
head(BrewStatesC)
## # A tibble: 6 x 2
##   State count
##   <chr> <int>
## 1 CO       47
## 2 CA       39
## 3 MI       32
## 4 OR       29
## 5 TX       28
## 6 PA       25
state.freq <- table(Breweries$State)
barplot(state.freq[order(state.freq)], 
        horiz = T,
        border = NA,
        xlim = c(0, 100),
        main = "Number of Breweries per State",
        xlab = "Number of Brewiews",
        ylab = "States")

#
# Counting the number of brewweries in each state:

head(state.freq)
## 
## AK AL AR AZ CA CO 
##  7  3  2 11 39 47
sDF<-data.frame(state.freq) 
sDF<- rename(sDF, State = Var1, Brew_Count=Freq)


Plot the number of breweries in each state.
p1 <- ggplot(Breweries, aes(x = fct_rev(fct_infreq(State)), fill = fct_infreq(State))) + geom_bar() + coord_flip() + theme(axis.title.x=element_blank(),                                                                                                                   axis.title.y=element_blank(),legend.position = "none")


ggplotly(p1)  
Plot the number of beer brands from each state.
p2 <- ggplot(fullData, aes(x = fct_rev(fct_infreq(State)), fill = fct_rev(fct_infreq(State)))) + geom_bar()  + theme(legend.position = "none") +
  labs( title = "", x = "none", y = "Count of Beer") + theme(axis.title.x=element_blank(),                                                                                                                   axis.title.y=element_blank(),legend.position = "none")

ggplotly(p2)
Did we have any empty rows in our data tables? Let’s find out here:
naB_Id<-sum(is.na(fullData$Brew_ID))
naB_Nm<-sum(is.na(fullData$BreweryName))
naB_C<-sum(is.na(fullData$City))
naB_S<-sum(is.na(fullData$State))
naBeer_Nm<-sum(is.na(fullData$BeerName))
naBeer_Id<-sum(is.na(fullData$Beer_ID))
naABV<-sum(is.na(fullData$ABV))
naIBU<-sum(is.na(fullData$IBU))
naStyle<-sum(is.na(fullData$Style))
naOunces<-sum(is.na(fullData$Ounces))


naResults = paste(naB_Id, naB_Nm, naB_C, naB_S, naBeer_Nm, naBeer_Id, naABV, naIBU, naStyle, naOunces, sep = ", ")

print(naResults)
## [1] "0, 0, 0, 0, 0, 0, 62, 1005, 5, 0"
names(fullData)
##  [1] "Brew_ID"     "BreweryName" "City"        "State"       "BeerName"   
##  [6] "Beer_ID"     "ABV"         "IBU"         "Style"       "Ounces"
select(fullData, State, ABV, IBU)
## # A tibble: 2,410 x 3
##    State   ABV   IBU
##    <chr> <dbl> <dbl>
##  1 MN    0.045    50
##  2 MN    0.049    26
##  3 MN    0.048    19
##  4 MN    0.06     38
##  5 MN    0.06     25
##  6 MN    0.056    47
##  7 KY    0.08     68
##  8 KY    0.125    80
##  9 KY    0.077    25
## 10 KY    0.042    42
## # … with 2,400 more rows
Breweries %>%
  group_by(State) %>%
  tally(sort = TRUE)
## # A tibble: 51 x 2
##    State     n
##    <chr> <int>
##  1 CO       47
##  2 CA       39
##  3 MI       32
##  4 OR       29
##  5 TX       28
##  6 PA       25
##  7 MA       23
##  8 WA       23
##  9 IN       22
## 10 WI       20
## # … with 41 more rows
fullData %>%
  group_by(State) %>%
  summarise(state_count = n()) %>%
  arrange(desc(state_count))
## # A tibble: 51 x 2
##    State state_count
##    <chr>       <int>
##  1 CO            265
##  2 CA            183
##  3 MI            162
##  4 IN            139
##  5 TX            130
##  6 OR            125
##  7 PA            100
##  8 IL             91
##  9 WI             87
## 10 MA             82
## # … with 41 more rows
mFD <- fullData %>%
  group_by(State) %>%
  summarise_each(funs(mean), mean_ABV = ABV, mean_IBU = IBU) %>%
  arrange(desc(mean_ABV))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
ggplot(fullData, aes(x = ABV)) + 
  geom_bar(na.rm = T) + 
  theme_grey() + 
   coord_flip()

p11<-ggplot(fullData, aes(x = IBU, fill = "red")) + 
  geom_bar(na.rm = T) + 
  scale_color_brewer("Blues")
ggplotly(p11)
fullData %>%
  group_by(State) %>%
  summarise_each(funs(min, max), ABV) 
## # A tibble: 51 x 3
##    State    min    max
##    <chr>  <dbl>  <dbl>
##  1 AK     0.048  0.068
##  2 AL     0.05   0.093
##  3 AR     0.04   0.061
##  4 AZ    NA     NA    
##  5 CA    NA     NA    
##  6 CO    NA     NA    
##  7 CT     0.034  0.09 
##  8 DC     0.05   0.092
##  9 DE    NA     NA    
## 10 FL    NA     NA    
## # … with 41 more rows
mFDt<- data.frame(fullData %>%
  group_by(State) %>%
  summarise ( min_ABV =min(ABV), mean_ABV = mean(ABV), max_ABV = max(ABV)) %>%
  arrange(desc(max_ABV)))
mIBU<- fullData %>%
  group_by(State) %>%
    summarise (min_IBU =min(IBU), mean_IBU = mean(IBU), max_IBU = max(IBU)) %>%
  arrange(desc(max_IBU))
fullData %>%
    group_by(BreweryName) %>%
    summarise ( min_IBU =min(IBU), mean_IBU = mean(IBU), max_IBU = max(IBU)) %>%
  arrange(desc(max_IBU))
## # A tibble: 551 x 4
##    BreweryName                        min_IBU mean_IBU max_IBU
##    <chr>                                <dbl>    <dbl>   <dbl>
##  1 Astoria Brewing Company                 35     62       138
##  2 Wolf Hills Brewing Company             135    135       135
##  3 Cape Ann Brewing Company                30     60.4     130
##  4 Christian Moerlein Brewing Company     126    126       126
##  5 Buffalo Bayou Brewing Company           20     59.3     118
##  6 Modern Times Beer                       30     64.5     115
##  7 Beach Brewing Company                  108    108       108
##  8 Seven Brides Brewery                    55     80       105
##  9 Alameda Brewing                         20     53       103
## 10 Good People Brewing Company             36     58.6     103
## # … with 541 more rows
coef(lm(IBU ~ ABV, data = fullData))
## (Intercept)         ABV 
##   -34.09865  1282.03706

Colored by State

ggplot(fullData, aes(ABV, IBU, colour = State)) +
     geom_point(na.rm = T) +
  geom_abline(intercept = -34.09865, slope = 1282.037)

ggplot(fullData, aes(ABV, IBU, colour = ABV)) +
     geom_point(aes(colour = ABV), na.rm = T) +
  geom_abline(intercept = -34.09865, slope = 1282.037)

#### Color by state

ggplot(fullData, aes(ABV, IBU, colour = State)) +
  facet_wrap(~ State) +
    geom_point(na.rm = T) +
  geom_abline(intercept = -34.09865, slope = 1282.037) 

#### Color by alcohol volume

ggplot(fullData, aes(ABV, IBU, colour = State)) +
  facet_wrap(~ State) +
    geom_point(aes(colour = ABV),na.rm = TRUE) +
  geom_abline(intercept = -34.09865, slope = 1282.037)

ggplot(fullData, aes(ABV, IBU, colour = State)) +
  facet_wrap(~ Style) +
    geom_point(aes(colour = ABV),na.rm = TRUE) +
  geom_abline(intercept = -34.09865, slope = 1282.037)

ggplot(fullData, aes(State, ABV)) +
  geom_boxplot(na.rm = T)

ggplot(fullData, aes(State, IBU)) +
  geom_boxplot(na.rm = T) 

ggplot(fullData, aes(ABV, IBU, colour = ABV)) +
     geom_point(aes(colour = ABV), na.rm = T) +
  geom_abline(intercept = -34.09865, slope = 1282.037) + 
  geom_smooth(method = lm, se = T, na.rm = T)

#####Density map of ABV vs IBU.

dplot<-ggplot(fullData, aes(ABV, IBU, colour = ABV)) +
     geom_point( na.rm = T)

dplot + stat_density_2d(aes(fill= ..level.., alpha = 0.75 ) , na.rm = T, geom = "polygon")